This notebook supports the slides of the tutorial on "Learning from Imbalanced Data" that was presented in the 14th International Conference on Information, Intelligence, Systems and Applications (IISA 2023), University of Thessaly, Volos, Greece.
This part:
Author: Leonidas Akritidis
Last update: 10/07/2023
import numpy as np
import pandas as pd
import time
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
# This option slows down the entire notebook dramatically. All times are 2x - 3x greater
# sns.set_theme()
# Used for creating training and test sets
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
# Classification models
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# Evaluation Measures
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, recall_score
from sklearn.metrics import roc_curve, auc, balanced_accuracy_score, classification_report
# Experiment configuration (referenced throughout the notebook).
seed=42                        # global RNG seed for reproducibility
num_samples=1000               # total number of synthetic samples to generate
imbalance_ratio = [0.95,0.05]  # class priors: 95% majority (class 0), 5% minority (class 1)
num_classes=len(imbalance_ratio)
oversampling_ratio=1.0         # target minority/majority ratio (not referenced in the visible code)
np.random.seed(seed)           # also seed NumPy's global RNG state
The experiments will be based on a synthetic dataset generated with scikit-learn's make_classification function. We will build a test dataset with num_samples two-dimensional examples and len(imbalance_ratio) classes. The imbalance_ratio determines the imbalance between the majority and the minority class/es.
# Generate a 2-D synthetic dataset with the configured class imbalance.
# flip_y=0: no label noise; class_sep=0.5: moderately overlapping classes.
dataset = make_classification(n_samples=num_samples, n_features=2, n_clusters_per_class=2,
                              n_classes=num_classes, weights=imbalance_ratio, flip_y=0, class_sep=0.5,
                              n_informative=2, n_redundant=0, n_repeated=0, random_state=seed)
X = dataset[0]  # feature matrix, shape (num_samples, 2)
y = dataset[1]  # integer class labels
# Visualize the raw dataset, colored by class label.
sns.scatterplot(x=X[:,0], y=X[:,1], hue=y, legend=False)
<AxesSubplot:>
Print the distribution of samples in the dataset classes:
# Report how many samples fall into each class of the full dataset.
print("Class Distribution:")
for k in range(num_classes):
    print("Class", k, ":", int((y == k).sum()), "samples")
Class Distribution: Class 0 : 950 samples Class 1 : 50 samples
We prepare the dataset for classification. We split it into a training and a test set in a ratio $4:1$; in other words, 80% of the original data will be used for training, and 20% will be used for testing. Stratifying by the target variable guarantees that the distributions of the dataset classes will be (about) the same in the training and test sets.
# NOTE(review): train_test_split is already imported at the top of the notebook;
# this re-import is redundant but harmless.
from sklearn.model_selection import train_test_split
# 80/20 split; stratify=y keeps the class proportions identical in both subsets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=seed)
Print and plot the distribution of samples in the classes of the test set:
# Print the per-class counts of the test set and plot them as a bar chart.
for k in range(num_classes):
    print("Class", k, ":", len(y_test[y_test==k]), "samples")
sns.countplot(x=y_test)
plt.title('Class distribution in the test set\n', fontsize=14)
Class 0 : 190 samples Class 1 : 10 samples
Text(0.5, 1.0, 'Class distribution in the test set\n')
We will use five simple machine learning models to perform classification:
# The five baseline classifiers; the FCFF Neural Net is a fully-connected
# feed-forward network with the ReLU activation function.
# (A markdown fragment had been fused onto this line by the notebook extraction.)
models = {
    "Logistic Regression": LogisticRegression(max_iter=300, random_state=seed),
    "Decision Tree": DecisionTreeClassifier(criterion='gini', max_depth=None, max_features=None, random_state=seed),
    "Random Forest": RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=None, max_features='sqrt',
                                            n_jobs=8, random_state=seed),
    "SVM (RBF Kernel)": SVC(kernel='rbf', C=1, random_state=seed),
    "FCFF Neural Net": MLPClassifier(activation='relu', hidden_layer_sizes=(16, 4), solver='adam', random_state=seed),
}
# Accumulates [classifier, method, accuracy, balanced accuracy, auc] rows
# for all oversampling experiments.
Results_over = []
plot_decision_regions_2D(): The following function plots the dataset and the decision boundaries that are determined by each classifier.
def plot_decision_regions_2D(X, y, classifier, resolution=0.02):
    """Draw the 2-D dataset together with the decision regions of a fitted classifier.

    X: feature matrix of shape (n, 2); y: class labels; classifier: any fitted
    estimator exposing predict(); resolution: grid step for the region mesh.
    """
    markers = ('o', 'o', 'o', 'o', 's', 'v', '^')
    colors = ('#1f77b4', '#ff7f0e', 'red', 'blue', 'lightgreen', 'gray', 'cyan')
    class_labels = np.unique(y)
    cmap = ListedColormap(colors[:len(class_labels)])

    # Build a dense grid covering the data with a small margin on every side.
    x1_lo, x1_hi = X[:, 0].min() - 0.2, X[:, 0].max() + 0.2
    x2_lo, x2_hi = X[:, 1].min() - 0.2, X[:, 1].max() + 0.2
    xx1, xx2 = np.meshgrid(np.arange(x1_lo, x1_hi, resolution),
                           np.arange(x2_lo, x2_hi, resolution))

    # Classify every grid point and paint the predicted regions.
    grid_points = np.c_[xx1.ravel(), xx2.ravel()]
    Z = classifier.predict(grid_points).reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.15, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    # Overlay the actual samples, one scatter call per class.
    for idx, cl in enumerate(class_labels):
        member = (y == cl)
        plt.scatter(x=X[member, 0], y=X[member, 1], alpha=0.8, c=colors[idx],
                    marker=markers[idx], label=cl, edgecolor='white')
TrainTestModel(): A parametric function that trains and tests a classifier.
def TrainTestModel(train_X, test_X, train_y, test_y, mdl, res, desc):
    """Train the classifier registered under `mdl`, evaluate it on the test set,
    append its scores to `res`, and plot its decision regions.

    Parameters:
        train_X, train_y: training features/labels (possibly resampled).
        test_X, test_y:   held-out test features/labels.
        mdl:  key into the global `models` dict; also used in the plot title.
        res:  list collecting [model, desc, accuracy, balanced accuracy, auc] rows.
        desc: textual label of the sampling scenario (e.g. "SMOTE").
    """
    t0 = time.time()
    print("Training", mdl + "...\t", end="", flush=True)
    # NOTE(review): this refits the shared estimator object from the global
    # `models` dict, so each call overwrites the previous fit of that model.
    clf = models[mdl]
    clf.fit(train_X, train_y)
    print(" (%5.3f sec). \t" % (time.time() - t0), end="", flush=True)
    y_predicted = clf.predict(test_X)
    acc = accuracy_score(test_y, y_predicted)
    bacc = balanced_accuracy_score(test_y, y_predicted)
    if num_classes == 2:
        # AUC computed from hard 0/1 predictions (not probability scores), so the
        # ROC has a single operating point; with binary labels this AUC equals
        # balanced accuracy — note the two printed values always match.
        fpr, tpr, thresholds = roc_curve(test_y, y_predicted)
        auc_m = auc(fpr, tpr)
    else:
        # roc_curve is binary-only; fall back to balanced accuracy.
        auc_m = bacc
    print("Accuracy=%5.4f" % acc, "\tBalanced Accuracy=%5.4f"% bacc, "\tAUC=%5.4f" % auc_m, flush=True)
    res.append([mdl, desc, acc, bacc, auc_m])
    # Draw decision regions over the union of training and test samples.
    X_stacked = np.vstack((train_X, test_X))
    Y_stacked = np.hstack((train_y, test_y))
    plot_decision_regions_2D(X_stacked, Y_stacked, clf)
    plt.xlabel("x0")
    plt.ylabel("x1")
    plt.title(mdl + " (Accuracy: " + (str)(round(acc, 3)) + ", AUC: " + (str)(round(auc_m, 3)) + ")")
    plt.legend(loc='upper left')
fig = plt.figure(figsize=(18, 10))
# Train/evaluate every classifier on the original (imbalanced) data,
# one subplot per model (positions 1..5 of a 2x3 grid).
for itr, model in enumerate(models, start=1):
    plt.subplot(2, 3, itr)
    TrainTestModel(X_train, X_test, y_train, y_test, model, Results_over, "Original data")
plt.subplots_adjust(hspace=0.25)
# The baseline rows are shared with the undersampling experiments below.
Results_under = Results_over.copy()
Training Logistic Regression... (0.016 sec). Accuracy=0.9500 Balanced Accuracy=0.5000 AUC=0.5000 Training Decision Tree... (0.000 sec). Accuracy=0.9600 Balanced Accuracy=0.6947 AUC=0.6947 Training Random Forest... (0.085 sec). Accuracy=0.9650 Balanced Accuracy=0.6500 AUC=0.6500 Training SVM (RBF Kernel)... (0.000 sec). Accuracy=0.9600 Balanced Accuracy=0.6000 AUC=0.6000 Training FCFF Neural Net... (0.132 sec). Accuracy=0.9550 Balanced Accuracy=0.5500 AUC=0.5500
The sampling_strategy parameter is defined by the ratio $\alpha={N_{min}}/{N_{maj}}$ where:
In the imbalanced dataset, $\alpha$ is:
import imblearn
# Print the imbalanced-learn version: sampler APIs changed across releases.
print(imblearn.__version__)
# alpha = N_min / N_maj of the generated dataset (0.05 / 0.95, rounded to 0.05).
alpha = round(imbalance_ratio[1] / imbalance_ratio[0],2)
0.10.1
Oversampling refers to pre-processing strategies that enrich the minority classes with artificial samples, aiming at alleviating the problem of class imbalance. This simplistic definition includes a surprising number of oversampling approaches, such as typical data mining algorithms and heuristics, boosting, bagging, deep generative models, and so on. The figure below depicts a deeper categorization into two major groups: traditional synthetic sampling and algorithm centered techniques.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=seed)
The simplest approach involves duplicating examples in the minority class, although these examples don't add any new information to the model. Instead, new examples can be synthesized from the existing examples.
from imblearn.over_sampling import RandomOverSampler
# sampling_strategy='auto': oversample the minority class until it matches the majority class.
# random_state=seed added for reproducibility — every other sampler in this notebook passes it.
oversample = RandomOverSampler(sampling_strategy='auto', random_state=seed)
X_over_train, y_over_train = oversample.fit_resample(X_train, y_train)
fig = plt.figure(figsize=(18, 10))
# Panel 1: the original data; panels 2-6: decision regions per classifier
# trained on the randomly-oversampled data.
plt.subplot(2, 3, 1)
sns.scatterplot(x=X[:,0], y=X[:,1], hue=y, legend=False)
for itr, model in enumerate(models, start=2):
    plt.subplot(2, 3, itr)
    TrainTestModel(X_over_train, X_test, y_over_train, y_test, model, Results_over, "ROS")
plt.subplots_adjust(hspace=0.25)
Training Logistic Regression... (0.000 sec). Accuracy=0.7750 Balanced Accuracy=0.7868 AUC=0.7868 Training Decision Tree... (0.000 sec). Accuracy=0.9600 Balanced Accuracy=0.6947 AUC=0.6947 Training Random Forest... (0.069 sec). Accuracy=0.9650 Balanced Accuracy=0.6974 AUC=0.6974 Training SVM (RBF Kernel)... (0.031 sec). Accuracy=0.9450 Balanced Accuracy=0.7342 AUC=0.7342 Training FCFF Neural Net... (0.266 sec). Accuracy=0.9350 Balanced Accuracy=0.8237 AUC=0.8237
This is a type of data augmentation for the minority class and is referred to as the Synthetic Minority Oversampling Technique, or SMOTE for short. The algorithm goes through the following steps:
This procedure can be used to create as many synthetic examples for the minority class as are required. The main advantage of SMOTE it that the synthetic minority examples are relatively close in feature space to existing examples from the minority class. Therefore, the class distribution is not affected much.
However, the synthetic examples are created without considering the majority class, possibly resulting in ambiguous examples if there is a strong overlap for the classes.
from imblearn.over_sampling import SMOTE
# sampling_strategy='auto': oversample the minority class until it matches the majority class.
# random_state=seed added for reproducibility, consistent with the other SMOTE variants below.
oversample = SMOTE(sampling_strategy='auto', random_state=seed)
X_over_train, y_over_train = oversample.fit_resample(X_train, y_train)
fig = plt.figure(figsize=(18, 10))
# Panel 1: the original data; panels 2-6: decision regions per classifier
# trained on the SMOTE-oversampled data.
plt.subplot(2, 3, 1)
sns.scatterplot(x=X[:,0], y=X[:,1], hue=y, legend=False)
for itr, model in enumerate(models, start=2):
    plt.subplot(2, 3, itr)
    TrainTestModel(X_over_train, X_test, y_over_train, y_test, model, Results_over, "SMOTE")
plt.subplots_adjust(hspace=0.25)
Training Logistic Regression... (0.000 sec). Accuracy=0.7700 Balanced Accuracy=0.7842 AUC=0.7842 Training Decision Tree... (0.000 sec). Accuracy=0.8550 Balanced Accuracy=0.6868 AUC=0.6868 Training Random Forest... (0.085 sec). Accuracy=0.8550 Balanced Accuracy=0.7342 AUC=0.7342 Training SVM (RBF Kernel)... (0.031 sec). Accuracy=0.9400 Balanced Accuracy=0.7789 AUC=0.7789 Training FCFF Neural Net... (0.250 sec). Accuracy=0.9500 Balanced Accuracy=0.7842 AUC=0.7842
SMOTE does not care about the "nature" of the minority samples to be used for synthetic data generation. Consequently, it may use outliers for synthetic data generation. This issue is effectively addressed by the Borderline-SMOTE variant.
More specifically, each sample $x_i$ from the minority class is categorized as:
Borderline SMOTE uses only the samples "in danger" to generate new samples. It finds their $k$-nearest neighbors and generates samples over the line that connects them. The neighboring points must belong to the same class as the sample "in danger".
from imblearn.over_sampling import BorderlineSMOTE
# sampling_strategy='auto': oversample the minority class until it matches the majority
# class (the original comment claimed a 10% target, which does not match 'auto').
oversample = BorderlineSMOTE(sampling_strategy='auto', random_state=seed)
X_over_train, y_over_train = oversample.fit_resample(X_train, y_train)
fig = plt.figure(figsize=(18, 10))
# Panel 1: original data; panels 2-6: decision regions of each classifier.
plt.subplot(2, 3, 1)
sns.scatterplot(x=X[:,0], y=X[:,1], hue=y, legend=False)
itr = 1
for model in models:
    itr = itr + 1
    plt.subplot(2, 3, itr)
    TrainTestModel(X_over_train, X_test, y_over_train, y_test, model, Results_over, "Borderline-SMOTE")
plt.subplots_adjust(hspace=0.25)
Training Logistic Regression... (0.016 sec). Accuracy=0.8200 Balanced Accuracy=0.7632 AUC=0.7632 Training Decision Tree... (0.000 sec). Accuracy=0.9250 Balanced Accuracy=0.8184 AUC=0.8184 Training Random Forest... (0.069 sec). Accuracy=0.9500 Balanced Accuracy=0.7842 AUC=0.7842 Training SVM (RBF Kernel)... (0.016 sec). Accuracy=0.9450 Balanced Accuracy=0.7816 AUC=0.7816 Training FCFF Neural Net... (0.250 sec). Accuracy=0.9600 Balanced Accuracy=0.8368 AUC=0.8368
Unlike SMOTE and Borderline SMOTE that use $k$-nearest neighbors algorithm to generate synthetic samples, this technique uses an SVM classifier. The classifier is initially trained on the original training set. Then, synthetic data are randomly created along the lines joining each minority class support vector with a number of its nearest neighbors.
With SMOTE-SVM more data points are synthesized away from the region of class overlap. In other words, this technique focuses more on the region where the data is separated.
from imblearn.over_sampling import SVMSMOTE
# sampling_strategy='auto': oversample the minority class until it matches the majority
# class (the original comment claimed a 10% target, which does not match 'auto').
oversample = SVMSMOTE(sampling_strategy='auto', random_state=seed)
X_over_train, y_over_train = oversample.fit_resample(X_train, y_train)
fig = plt.figure(figsize=(18, 10))
# Panel 1: original data; panels 2-6: decision regions of each classifier.
plt.subplot(2, 3, 1)
sns.scatterplot(x=X[:,0], y=X[:,1], hue=y, legend=False)
itr = 1
for model in models:
    itr = itr + 1
    plt.subplot(2, 3, itr)
    TrainTestModel(X_over_train, X_test, y_over_train, y_test, model, Results_over, "SMOTE-SVM")
plt.subplots_adjust(hspace=0.25)
Training Logistic Regression... (0.016 sec). Accuracy=0.8450 Balanced Accuracy=0.8237 AUC=0.8237 Training Decision Tree... (0.000 sec). Accuracy=0.8950 Balanced Accuracy=0.6132 AUC=0.6132 Training Random Forest... (0.085 sec). Accuracy=0.9550 Balanced Accuracy=0.7395 AUC=0.7395 Training SVM (RBF Kernel)... (0.016 sec). Accuracy=0.9500 Balanced Accuracy=0.7368 AUC=0.7368 Training FCFF Neural Net... (0.234 sec). Accuracy=0.9600 Balanced Accuracy=0.7895 AUC=0.7895
$k$-Means SMOTE employs the $k$-Means clustering algorithm in conjunction with SMOTE to rebalance the dataset. It generates instances according to the cluster density. In this way, it confronts the intra-class imbalance. This method avoids the generation of noise in the safe area. It also pays attention to both inter-class and intra-class imbalance.
The algorithm is organized in three steps:
This process is repeated until the desired number of minority instances is eventually created.
from imblearn.over_sampling import KMeansSMOTE
# sampling_strategy='auto': oversample the minority class until it matches the majority
# class (the original comment claimed a 10% target, which does not match 'auto').
# cluster_balance_threshold=0.01: accept clusters with as little as 1% minority share —
# presumably lowered so that usable clusters exist at this imbalance; verify.
oversample = KMeansSMOTE(sampling_strategy='auto', cluster_balance_threshold=0.01, random_state=seed)
X_over_train, y_over_train = oversample.fit_resample(X_train, y_train)
fig = plt.figure(figsize=(18, 10))
# Panel 1: original data; panels 2-6: decision regions of each classifier.
plt.subplot(2, 3, 1)
sns.scatterplot(x=X[:,0], y=X[:,1], hue=y, legend=False)
itr = 1
for model in models:
    itr = itr + 1
    plt.subplot(2, 3, itr)
    TrainTestModel(X_over_train, X_test, y_over_train, y_test, model, Results_over, "kMeans-SMOTE")
plt.subplots_adjust(hspace=0.25)
Training Logistic Regression... (0.000 sec). Accuracy=0.8150 Balanced Accuracy=0.7132 AUC=0.7132 Training Decision Tree... (0.000 sec). Accuracy=0.9050 Balanced Accuracy=0.6658 AUC=0.6658 Training Random Forest... (0.078 sec). Accuracy=0.9200 Balanced Accuracy=0.7211 AUC=0.7211 Training SVM (RBF Kernel)... (0.016 sec). Accuracy=0.9400 Balanced Accuracy=0.6842 AUC=0.6842 Training FCFF Neural Net... (0.242 sec). Accuracy=0.9350 Balanced Accuracy=0.7763 AUC=0.7763
ADASYN determines the number of synthetic examples that need to be generated for each minority example by the amount of its majority nearest neighbors. The more majority nearest neighbors, the more synthetic examples will be created.
The method is based on a density distribution $\Gamma_i$ as a criterion to decide the number of synthetic examples; while in SMOTE, each minority example has an equal chance of being selected for the synthesis process.
from imblearn.over_sampling import ADASYN
# sampling_strategy='auto': oversample the minority class until it matches the majority
# class (the original comment claimed a 10% target, which does not match 'auto').
oversample = ADASYN(sampling_strategy='auto', random_state=seed)
X_over_train, y_over_train = oversample.fit_resample(X_train, y_train)
fig = plt.figure(figsize=(18, 10))
# Panel 1: original data; panels 2-6: decision regions of each classifier.
plt.subplot(2, 3, 1)
sns.scatterplot(x=X[:,0], y=X[:,1], hue=y, legend=False)
itr = 1
for model in models:
    itr = itr + 1
    plt.subplot(2, 3, itr)
    TrainTestModel(X_over_train, X_test, y_over_train, y_test, model, Results_over, "ADASYN")
plt.subplots_adjust(hspace=0.25)
Training Logistic Regression... (0.000 sec). Accuracy=0.7300 Balanced Accuracy=0.8579 AUC=0.8579 Training Decision Tree... (0.000 sec). Accuracy=0.8350 Balanced Accuracy=0.7711 AUC=0.7711 Training Random Forest... (0.085 sec). Accuracy=0.8450 Balanced Accuracy=0.7289 AUC=0.7289 Training SVM (RBF Kernel)... (0.016 sec). Accuracy=0.8850 Balanced Accuracy=0.7026 AUC=0.7026 Training FCFF Neural Net... (0.250 sec). Accuracy=0.9100 Balanced Accuracy=0.8579 AUC=0.8579
Let's compare the Accuracy and Balanced Accuracy measurements for all 5 classifiers and for all 7 tested sampling scenarios (the original data plus the 6 oversampling techniques):
# Collect all oversampling results into a DataFrame and compare Accuracy and
# Balanced Accuracy per classifier and per sampling method, as grouped bar charts.
df_results = pd.DataFrame(Results_over, columns=['Classifier', 'Method', 'Accuracy', 'BalancedAccuracy', 'AUC']). \
    sort_values(['Classifier'], ascending=[True])
fig = plt.figure(figsize=(18, 10))
plt.subplot(2, 1, 1)
ax = sns.barplot(data=df_results, x="Classifier", y="Accuracy", hue="Method", edgecolor='white')
plt.legend(bbox_to_anchor=(1, 1.07), loc='right', borderaxespad=0, ncol=7)
# Annotate every bar with its numeric value.
for i in ax.containers:
    ax.bar_label(i, fmt='%.3f', fontsize=8)
plt.subplot(2, 1, 2)
ax = sns.barplot(data=df_results, x="Classifier", y="BalancedAccuracy", hue="Method", edgecolor='white')
plt.legend(bbox_to_anchor=(1, 1.07), loc='right', borderaxespad=0, ncol=7)
for i in ax.containers:
    ax.bar_label(i, fmt='%.3f', fontsize=8)
plt.subplots_adjust(hspace=0.35)
A preprocessing technique that brings balance to a dataset by reducing the population of the majority class. We mainly encounter two approaches:
The simplest Prototype Selection undersampling technique. It randomly selects a subset of data for the targeted classes.
from imblearn.under_sampling import RandomUnderSampler
# sampling_strategy='auto': undersample the majority class until it matches the minority
# class (the original comment wrongly described oversampling). replacement=True keeps
# majority points sampled with replacement.
undersample = RandomUnderSampler(sampling_strategy='auto', replacement=True, random_state=seed)
X_under_train, y_under_train = undersample.fit_resample(X_train, y_train)
fig = plt.figure(figsize=(18, 10))
# Panel 1: original data; panels 2-6: decision regions of each classifier.
plt.subplot(2, 3, 1)
sns.scatterplot(x=X[:,0], y=X[:,1], hue=y, legend=False)
itr = 1
for model in models:
    itr = itr + 1
    plt.subplot(2, 3, itr)
    TrainTestModel(X_under_train, X_test, y_under_train, y_test, model, Results_under, "Random Undersampling")
plt.subplots_adjust(hspace=0.25)
Training Logistic Regression... (0.000 sec). Accuracy=0.8050 Balanced Accuracy=0.8026 AUC=0.8026 Training Decision Tree... (0.000 sec). Accuracy=0.6950 Balanced Accuracy=0.6974 AUC=0.6974 Training Random Forest... (0.069 sec). Accuracy=0.8100 Balanced Accuracy=0.8526 AUC=0.8526 Training SVM (RBF Kernel)... (0.000 sec). Accuracy=0.8950 Balanced Accuracy=0.7553 AUC=0.7553 Training FCFF Neural Net... (0.039 sec). Accuracy=0.8250 Balanced Accuracy=0.7658 AUC=0.7658
NearMiss-1 selects the positive samples for which the average distance to the $k$ closest samples of the negative class is the smallest.
from imblearn.under_sampling import NearMiss
# NearMiss-1: keep the majority samples whose average distance to their n_neighbors
# closest minority samples is smallest; 'auto' balances the two classes.
undersample = NearMiss(version=1, sampling_strategy='auto', n_neighbors=5)
X_under_train, y_under_train = undersample.fit_resample(X_train, y_train)
fig = plt.figure(figsize=(18, 10))
# Panel 1: original data; panels 2-6: decision regions of each classifier.
plt.subplot(2, 3, 1)
sns.scatterplot(x=X[:,0], y=X[:,1], hue=y, legend=False)
itr = 1
for model in models:
    itr = itr + 1
    plt.subplot(2, 3, itr)
    TrainTestModel(X_under_train, X_test, y_under_train, y_test, model, Results_under, "NearMiss-1")
plt.subplots_adjust(hspace=0.25)
Training Logistic Regression... (0.000 sec). Accuracy=0.8200 Balanced Accuracy=0.7632 AUC=0.7632 Training Decision Tree... (0.000 sec). Accuracy=0.2050 Balanced Accuracy=0.4868 AUC=0.4868 Training Random Forest... (0.053 sec). Accuracy=0.1950 Balanced Accuracy=0.4816 AUC=0.4816 Training SVM (RBF Kernel)... (0.000 sec). Accuracy=0.4400 Balanced Accuracy=0.4684 AUC=0.4684 Training FCFF Neural Net... (0.031 sec). Accuracy=0.4550 Balanced Accuracy=0.5711 AUC=0.5711
NearMiss-2 selects the positive samples for which the average distance to the $k$ farthest samples of the negative class is the smallest.
# NearMiss version 2: selection based on the farthest negative-class neighbors.
undersample = NearMiss(version=2, sampling_strategy='auto', n_neighbors=5)
X_under_train, y_under_train = undersample.fit_resample(X_train, y_train)
fig = plt.figure(figsize=(18, 10))
# Panel 1 shows the original data; the remaining panels show each classifier.
plt.subplot(2, 3, 1)
sns.scatterplot(x=X[:,0], y=X[:,1], hue=y, legend=False)
for itr, model in enumerate(models, start=2):
    plt.subplot(2, 3, itr)
    TrainTestModel(X_under_train, X_test, y_under_train, y_test, model, Results_under, "NearMiss-2")
plt.subplots_adjust(hspace=0.25)
Training Logistic Regression... (0.000 sec). Accuracy=0.4850 Balanced Accuracy=0.4447 AUC=0.4447 Training Decision Tree... (0.000 sec). Accuracy=0.1400 Balanced Accuracy=0.5474 AUC=0.5474 Training Random Forest... (0.062 sec). Accuracy=0.1400 Balanced Accuracy=0.4526 AUC=0.4526 Training SVM (RBF Kernel)... (0.000 sec). Accuracy=0.2050 Balanced Accuracy=0.4395 AUC=0.4395 Training FCFF Neural Net... (0.031 sec). Accuracy=0.2350 Balanced Accuracy=0.3605 AUC=0.3605
NearMiss-3 is a 2-step algorithm.
# NearMiss version 3 (the 2-step variant described above); 'auto' balances the classes.
undersample = NearMiss(version=3, sampling_strategy='auto', n_neighbors=5)
X_under_train, y_under_train = undersample.fit_resample(X_train, y_train)
fig = plt.figure(figsize=(18, 10))
# Panel 1: original data; panels 2-6: decision regions of each classifier.
plt.subplot(2, 3, 1)
sns.scatterplot(x=X[:,0], y=X[:,1], hue=y, legend=False)
itr = 1
for model in models:
    itr = itr + 1
    plt.subplot(2, 3, itr)
    TrainTestModel(X_under_train, X_test, y_under_train, y_test, model, Results_under, "NearMiss-3")
plt.subplots_adjust(hspace=0.25)
Training Logistic Regression... (0.000 sec). Accuracy=0.8100 Balanced Accuracy=0.7579 AUC=0.7579 Training Decision Tree... (0.000 sec). Accuracy=0.4650 Balanced Accuracy=0.5763 AUC=0.5763 Training Random Forest... (0.053 sec). Accuracy=0.3600 Balanced Accuracy=0.6158 AUC=0.6158 Training SVM (RBF Kernel)... (0.000 sec). Accuracy=0.5400 Balanced Accuracy=0.7579 AUC=0.7579 Training FCFF Neural Net... (0.038 sec). Accuracy=0.5750 Balanced Accuracy=0.7763 AUC=0.7763
Clustering is another effective way for undersampling the majority class on imbalanced datasets. The idea is that these algorithms construct homogeneous, complete clusters that group together similar data points. Therefore, an entire cluster of elements can be effectively replaced by a single representative element, e.g., cluster center, centroid, clustroid, etc. This key concept renders clustering an attractive approach to undersampling, because it enables the replacement of entire sample groups by a single, representative data point.
Lin et al. proposed two clustering strategies for undersampling, both based on the well-known 𝑘-Means algorithm. In the first strategy, the number of clusters to be constructed is determined by the population of the minority samples. The centroids of the generated clusters are subsequently utilized to replace the entire majority class, rendering the dataset perfectly balanced. The second strategy is very similar to the first one, except that we do not use the centroids as representative points, but their nearest neighbor. To demonstrate the effectiveness of their methods, the authors employed 44 small and 2 large datasets to train 5 different classifiers.
from imblearn.under_sampling import ClusterCentroids
from sklearn.cluster import KMeans
# k-Means estimator used by ClusterCentroids to replace groups of majority samples
# with representative points. NOTE(review): presumably ClusterCentroids overrides
# n_clusters with the target sample count internally — verify against imblearn docs.
clusterer = KMeans(n_clusters=8, init='random', random_state=seed)
undersample = ClusterCentroids(estimator=clusterer, sampling_strategy='auto', random_state=seed)
X_under_train, y_under_train = undersample.fit_resample(X_train, y_train)
fig = plt.figure(figsize=(18, 10))
# Panel 1: original data; panels 2-6: decision regions of each classifier.
plt.subplot(2, 3, 1)
sns.scatterplot(x=X[:,0], y=X[:,1], hue=y, legend=False)
itr = 1
for model in models:
    itr = itr + 1
    plt.subplot(2, 3, itr)
    TrainTestModel(X_under_train, X_test, y_under_train, y_test, model, Results_under, "Undersampling (clustering)")
plt.subplots_adjust(hspace=0.25)
Training Logistic Regression... (0.000 sec). Accuracy=0.7550 Balanced Accuracy=0.7763 AUC=0.7763 Training Decision Tree... (0.000 sec). Accuracy=0.7700 Balanced Accuracy=0.7842 AUC=0.7842 Training Random Forest... (0.069 sec). Accuracy=0.8900 Balanced Accuracy=0.8474 AUC=0.8474 Training SVM (RBF Kernel)... (0.000 sec). Accuracy=0.8450 Balanced Accuracy=0.8237 AUC=0.8237 Training FCFF Neural Net... (0.038 sec). Accuracy=0.8300 Balanced Accuracy=0.8158 AUC=0.8158
# Collect all undersampling results (plus the shared baseline rows) into a DataFrame
# and compare Accuracy and Balanced Accuracy per classifier and per sampling method.
df_results = pd.DataFrame(Results_under, columns=['Classifier', 'Method', 'Accuracy', 'BalancedAccuracy', 'AUC']). \
    sort_values(['Classifier'], ascending=[True])
fig = plt.figure(figsize=(18, 10))
plt.subplot(2, 1, 1)
ax = sns.barplot(data=df_results, x="Classifier", y="Accuracy", hue="Method", edgecolor='white')
plt.legend(bbox_to_anchor=(1, 1.07), loc='right', borderaxespad=0, ncol=6)
# Annotate every bar with its numeric value.
for i in ax.containers:
    ax.bar_label(i, fmt='%.3f', fontsize=8)
plt.subplot(2, 1, 2)
ax = sns.barplot(data=df_results, x="Classifier", y="BalancedAccuracy", hue="Method", edgecolor='white')
plt.legend(bbox_to_anchor=(1, 1.07), loc='right', borderaxespad=0, ncol=6)
for i in ax.containers:
    ax.bar_label(i, fmt='%.3f', fontsize=8)
plt.subplots_adjust(hspace=0.35)
Clustering ($k$-Means) and Random undersampling seem to be the best undersampling techniques; they combine both improved Balanced Accuracy and Accuracy. On the other hand, NearMiss was the worst method in all three of its versions: its Accuracy on the final dataset was very low, and the same applies to its Balanced Accuracy.
Hybrid sampling combines both oversampling and undersampling to mitigate class imbalance.
Two strategies:
This strategy first shrinks the majority class by using an undersampling technique, and then it enlarges the minority classes by applying oversampling.
The code block below defines two lists: undersamplers and oversamplers. They both contain tuples of three elements (x, y, z) where:
x is a string that represents the undersampling/oversampling method,
y is the imbalance ratio expressed as ${N_{min}}/{N_{maj}}$, where ${N_{min}}$ is the number of the minority samples and ${N_{maj}}$ is the number of the majority samples, and
z is the object of the corresponding undersampling/oversampling method.
# Undersampling techniques: A list of tuples of three elements: [ (Method Description, Imbalance Ratio, Algorithm Object) ]
# Target resampling ratios: from the dataset's native ratio (alpha = 0.05) up to
# fully balanced (1.0) in steps of 0.19, i.e. 0.05, 0.24, 0.43, 0.62, 0.81, 1.00 —
# 6 ratios per method, 5 undersamplers x 6 = 30 grid rows.
step = 0.19
# Undersampling techniques: A list of tuples of three elements: [ (Method Description, Imbalance Ratio, Algorithm Object) ]
undersamplers = []
# Random undersampling (with replacement) at each ratio; names like "RUS-24" encode the percentage.
lst = [ ("RUS-" + str(int(imb_ratio*100)), imb_ratio, RandomUnderSampler(sampling_strategy=imb_ratio, replacement=True, random_state=seed)) for imb_ratio in np.arange(alpha, 1.01, step) ]
undersamplers.extend(lst)
# NearMiss versions 1-3 at each ratio.
lst = [ ("NM1-" + str(int(imb_ratio*100)), imb_ratio, NearMiss(version=1, sampling_strategy=imb_ratio, n_neighbors=5)) for imb_ratio in np.arange(alpha, 1.01, step) ]
undersamplers.extend(lst)
lst = [ ("NM2-" + str(int(imb_ratio*100)), imb_ratio, NearMiss(version=2, sampling_strategy=imb_ratio, n_neighbors=5)) for imb_ratio in np.arange(alpha, 1.01, step) ]
undersamplers.extend(lst)
lst = [ ("NM3-" + str(int(imb_ratio*100)), imb_ratio, NearMiss(version=3, sampling_strategy=imb_ratio, n_neighbors=5)) for imb_ratio in np.arange(alpha, 1.01, step) ]
undersamplers.extend(lst)
# Cluster-centroid undersampling, reusing the k-Means clusterer defined earlier.
lst = [ ("CLUS-" + str(int(imb_ratio*100)), imb_ratio, ClusterCentroids(estimator=clusterer, sampling_strategy=imb_ratio, random_state=seed)) for imb_ratio in np.arange(alpha, 1.01, step) ]
undersamplers.extend(lst)
# print(undersamplers)
# Oversampling techniques: A list of tuples of three elements: [ (Method Description, Imbalance Ratio, Algorithm Object) ]
oversamplers = []
lst = [ ("ROS-" + str(int(imb_ratio*100)), imb_ratio, RandomOverSampler(sampling_strategy=imb_ratio, random_state=seed)) for imb_ratio in np.arange(alpha, 1.01, step) ]
oversamplers.extend(lst)
lst = [ ("SMOTE-" + str(int(imb_ratio*100)), imb_ratio, SMOTE(sampling_strategy=imb_ratio, random_state=seed)) for imb_ratio in np.arange(alpha, 1.01, step) ]
oversamplers.extend(lst)
lst = [ ("BORSMOTE-" + str(int(imb_ratio*100)), imb_ratio, BorderlineSMOTE(sampling_strategy=imb_ratio, random_state=seed)) for imb_ratio in np.arange(alpha, 1.01, step) ]
oversamplers.extend(lst)
lst = [ ("SVMSMOTE-" + str(int(imb_ratio*100)), imb_ratio, SVMSMOTE(sampling_strategy=imb_ratio, random_state=seed)) for imb_ratio in np.arange(alpha, 1.01, step) ]
oversamplers.extend(lst)
lst = [ ("CLUSMOTE-" + str(int(imb_ratio*100)), imb_ratio, KMeansSMOTE(sampling_strategy=imb_ratio, cluster_balance_threshold=0.01, random_state=seed)) for imb_ratio in np.arange(alpha, 1.01, step) ]
oversamplers.extend(lst)
lst = [ ("ADASYN-" + str(int(imb_ratio*100)), imb_ratio, ADASYN(sampling_strategy=imb_ratio, random_state=seed)) for imb_ratio in np.arange(alpha, 1.01, step) ]
oversamplers.extend(lst)
#print(oversamplers)
from tqdm.notebook import tqdm

# Hybrid sampling, pass 1: first undersample the majority class (U), then
# oversample the minority class (O), and evaluate every classifier in
# `models` on each (undersampler, oversampler) combination.
Results_hybrid = []
for U in tqdm(undersamplers):
    for O in oversamplers:
        scenario = U[0] + " + " + O[0]
        # imbalanced-learn samplers raise when the requested ratio is already
        # satisfied (or cannot be reached); fall back to the unmodified data.
        # `except Exception` (not bare `except:`) so Ctrl-C still interrupts.
        try:
            X_1_train, y_1_train = U[2].fit_resample(X_train, y_train)
        except Exception:
            # Undersampling failed / skipped -> keep the original training set.
            X_1_train = X_train
            y_1_train = y_train
        try:
            X_2_train, y_2_train = O[2].fit_resample(X_1_train, y_1_train)
        except Exception:
            # Oversampling failed / skipped -> keep the undersampled set.
            X_2_train = X_1_train
            y_2_train = y_1_train
        for mdl in models:
            clf = models[mdl]
            clf.fit(X_2_train, y_2_train)
            y_predicted = clf.predict(X_test)
            acc = accuracy_score(y_test, y_predicted)
            bacc = balanced_accuracy_score(y_test, y_predicted)
            Results_hybrid.append([scenario, mdl, acc, bacc])

# Persist the results for the plotting cells below.
df_results = pd.DataFrame(Results_hybrid, columns=['Scenario', 'Classifier', 'Accuracy', 'BalancedAccuracy'])
df_results.to_csv('hybrid_underover_results.csv')
0%| | 0/30 [00:00<?, ?it/s]
# Overall comparison of the hybrid (undersample -> oversample) scenarios:
# average each scenario's scores over all classifiers and plot the top-20.
df = pd.read_csv('hybrid_underover_results.csv')
fig = plt.figure(figsize=(18, 14))
# Plot the 20 best hybrid sampling scenarios (in terms of accuracy).
# numeric_only=True: the frame also holds the string column 'Classifier',
# which pandas >= 2.0 no longer drops silently when averaging.
df_best = df.groupby('Scenario', as_index=False).mean(numeric_only=True).sort_values('Accuracy', ascending=False).head(20)
df_best = df_best.melt(id_vars=['Scenario'], value_vars=['Accuracy', 'BalancedAccuracy']).rename(columns=str.title)
df_best = df_best.sort_values(['Variable', 'Value'], ascending=[True, False])
plt.subplot(2, 1, 1)
ax = sns.barplot(data=df_best, x='Scenario', y='Value', hue='Variable', hue_order=['Accuracy', 'BalancedAccuracy'], edgecolor='white')
plt.legend(bbox_to_anchor=(1, 1.05), loc='right', borderaxespad=0, ncol=2)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
for container in ax.containers:
    ax.bar_label(container, fmt='%.3f', fontsize=8)
# Plot the 20 best hybrid sampling scenarios (in terms of balanced accuracy).
df_best = df.groupby('Scenario', as_index=False).mean(numeric_only=True).sort_values('BalancedAccuracy', ascending=False).head(20)
df_best = df_best.melt(id_vars=['Scenario'], value_vars=['Accuracy', 'BalancedAccuracy']).rename(columns=str.title)
df_best = df_best.sort_values(['Variable', 'Value'], ascending=[False, False])
plt.subplot(2, 1, 2)
ax = sns.barplot(data=df_best, x='Scenario', y='Value', hue='Variable', hue_order=['Accuracy', 'BalancedAccuracy'], edgecolor='white')
plt.legend(bbox_to_anchor=(1, 1.05), loc='right', borderaxespad=0, ncol=2)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
for container in ax.containers:
    ax.bar_label(container, fmt='%.3f', fontsize=8)
plt.subplots_adjust(hspace=0.6)
# Per-classifier view of the under -> over hybrid runs: one subplot per model,
# showing its 20 best scenarios ranked by balanced accuracy.
df = pd.read_csv('hybrid_underover_results.csv')
num_models = len(models)
fig = plt.figure(figsize=(18, num_models * 6))
for position, mdl in enumerate(models, start=1):
    top20 = df[df.Classifier == mdl].sort_values(by='BalancedAccuracy', ascending=False).head(20)
    melted = top20.melt(id_vars=['Scenario'], value_vars=['Accuracy', 'BalancedAccuracy']).rename(columns=str.title)
    melted = melted.sort_values(['Variable', 'Value'], ascending=[False, False])
    plt.subplot(num_models, 1, position)
    plt.title(mdl)
    ax = sns.barplot(data=melted, x='Scenario', y='Value', hue='Variable', hue_order=['Accuracy', 'BalancedAccuracy'], edgecolor='white')
    plt.legend(bbox_to_anchor=(1, 1.07), loc='right', borderaxespad=0, ncol=2)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
    for container in ax.containers:
        ax.bar_label(container, fmt='%.3f', fontsize=8)
plt.subplots_adjust(hspace=1.0)
First enlarge the minority classes by applying oversampling, then shrink the majority class with undersampling.
# Hybrid sampling, pass 2: first oversample the minority class (O), then
# undersample the majority class (U), and evaluate every classifier in
# `models` on each (oversampler, undersampler) combination.
Results_hybrid = []
for O in tqdm(oversamplers):
    for U in undersamplers:
        scenario = O[0] + " + " + U[0]
        # imbalanced-learn samplers raise when the requested ratio is already
        # satisfied (or cannot be reached); fall back to the unmodified data.
        # `except Exception` (not bare `except:`) so Ctrl-C still interrupts.
        try:
            X_1_train, y_1_train = O[2].fit_resample(X_train, y_train)
        except Exception:
            # Oversampling failed / skipped -> keep the original training set.
            X_1_train = X_train
            y_1_train = y_train
        try:
            X_2_train, y_2_train = U[2].fit_resample(X_1_train, y_1_train)
        except Exception:
            # Undersampling failed / skipped -> keep the oversampled set.
            X_2_train = X_1_train
            y_2_train = y_1_train
        for mdl in models:
            clf = models[mdl]
            clf.fit(X_2_train, y_2_train)
            y_predicted = clf.predict(X_test)
            acc = accuracy_score(y_test, y_predicted)
            bacc = balanced_accuracy_score(y_test, y_predicted)
            Results_hybrid.append([scenario, mdl, acc, bacc])

# Persist the results for the plotting cells below.
df_results = pd.DataFrame(Results_hybrid, columns=['Scenario', 'Classifier', 'Accuracy', 'BalancedAccuracy'])
df_results.to_csv('hybrid_overunder_results.csv')
0%| | 0/36 [00:00<?, ?it/s]
# Overall comparison of the hybrid (oversample -> undersample) scenarios:
# average each scenario's scores over all classifiers and plot the top-20.
df = pd.read_csv('hybrid_overunder_results.csv')
fig = plt.figure(figsize=(18, 14))
# Plot the 20 best hybrid sampling scenarios (in terms of accuracy).
# numeric_only=True: the frame also holds the string column 'Classifier',
# which pandas >= 2.0 no longer drops silently when averaging.
df_best = df.groupby('Scenario', as_index=False).mean(numeric_only=True).sort_values('Accuracy', ascending=False).head(20)
df_best = df_best.melt(id_vars=['Scenario'], value_vars=['Accuracy', 'BalancedAccuracy']).rename(columns=str.title)
df_best = df_best.sort_values(['Variable', 'Value'], ascending=[True, False])
plt.subplot(2, 1, 1)
ax = sns.barplot(data=df_best, x='Scenario', y='Value', hue='Variable', hue_order=['Accuracy', 'BalancedAccuracy'], edgecolor='white')
plt.legend(bbox_to_anchor=(1, 1.05), loc='right', borderaxespad=0, ncol=2)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
for container in ax.containers:
    ax.bar_label(container, fmt='%.3f', fontsize=8)
# Plot the 20 best hybrid sampling scenarios (in terms of balanced accuracy).
df_best = df.groupby('Scenario', as_index=False).mean(numeric_only=True).sort_values('BalancedAccuracy', ascending=False).head(20)
df_best = df_best.melt(id_vars=['Scenario'], value_vars=['Accuracy', 'BalancedAccuracy']).rename(columns=str.title)
df_best = df_best.sort_values(['Variable', 'Value'], ascending=[False, False])
plt.subplot(2, 1, 2)
ax = sns.barplot(data=df_best, x='Scenario', y='Value', hue='Variable', hue_order=['Accuracy', 'BalancedAccuracy'], edgecolor='white')
plt.legend(bbox_to_anchor=(1, 1.05), loc='right', borderaxespad=0, ncol=2)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
for container in ax.containers:
    ax.bar_label(container, fmt='%.3f', fontsize=8)
plt.subplots_adjust(hspace=0.6)
# Per-classifier view of the over -> under hybrid runs: one subplot per model,
# showing its 20 best scenarios ranked by balanced accuracy.
df = pd.read_csv('hybrid_overunder_results.csv')
num_models = len(models)
fig = plt.figure(figsize=(18, num_models * 6))
for position, mdl in enumerate(models, start=1):
    top20 = df[df.Classifier == mdl].sort_values(by='BalancedAccuracy', ascending=False).head(20)
    melted = top20.melt(id_vars=['Scenario'], value_vars=['Accuracy', 'BalancedAccuracy']).rename(columns=str.title)
    melted = melted.sort_values(['Variable', 'Value'], ascending=[False, False])
    plt.subplot(num_models, 1, position)
    plt.title(mdl)
    ax = sns.barplot(data=melted, x='Scenario', y='Value', hue='Variable', hue_order=['Accuracy', 'BalancedAccuracy'], edgecolor='white')
    plt.legend(bbox_to_anchor=(1, 1.07), loc='right', borderaxespad=0, ncol=2)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
    for container in ax.containers:
        ax.bar_label(container, fmt='%.3f', fontsize=8)
plt.subplots_adjust(hspace=1.0)